The purpose of this document is to get oriented with the first RNA-Seq data to come out of the Klotho project and with using Synapse. In this workflow we:

  1. Download mouse metadata and expression data
  2. Count mice in age, sex and genotype groups.
  3. Bin the mice into age groups (4 and 12 months).
  4. Examine expression data for batch effects
  5. Normalize and scale expression data.
  6. Calculate differential expression across all genotype groups using ANOVA
  7. Look for enrichment of differentially expressed genes.
  8. Compare expression of AD genes across Klotho genotypes
  9. Look for differential expression in biodomain genes across genotypes.
  10. Generate mouse gene lists for biodomains, KEGG pathways, intersections between GO terms and biodomains, and intersections between boidomains and KEGG pathways.
rm(list = ls())

library(here)

#expression.type = "transcript"
expression.type = "gene"

min.term.size = 10 #minimum number of genes allowed in a term

fdr.thresh = 0.1 #fdr for differentially expressed genes
p.thresh = 0.05 #nominal p value to look for effects

args <- commandArgs(trailingOnly=T)
subgroup <- args[2]

if(is.na(subgroup)){
  #subgroup <- "all ages"
  subgroup <- list("age_batch" = 12)
  #subgroup <- list("age_batch" = 4)
  #subgroup <- list("age_batch" = 4, "sex_ge" = "Female") #example with more than one filter
}

if(subgroup == "all ages"){
  results.dir <- here("Results", "all")
}else{
  subgroup.results <- paste(sapply(1:length(subgroup), 
    function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")), 
    collapse = "_")
  results.dir <- here("Results", subgroup.results)
}

if(!file.exists(results.dir)){
  dir.create(results.dir, TRUE)
  }

general.data.dir <- here("Data", "general")

remove.qc <- TRUE #if 1a_Klotho_QC.Rmd has been run, and animals were identified for removal
                  #set this to TRUE

ordered.geno <- c("FC/FC", "WT/FC", "WT/WT", "WT/VS", "VS/VS")

The subgroup analyzed in this workflow is: age_batch = 12

all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}

processed.data.dir <- file.path(results.dir, "processed_data")
if(!file.exists(processed.data.dir)){dir.create(processed.data.dir, recursive = TRUE)}

general.processed.data.dir <- here("Results", "Processed_Data")
if(!file.exists(general.processed.data.dir)){dir.create(general.processed.data.dir)}

geno.cols <- get_geno_cols(here("Data", "general", "geno_cols.csv"))
needed.libraries <- c("synapser", "pheatmap", "DESeq2", "DT", "vioplot", "RColorBrewer",
  "gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph", "wordcloud",
  "wordcloud2")
load_libraries(needed.libraries)
kl.ensembl <- "ENSMUSG00000058488"

#gene information tables
mouse.gene.table <- read.table(file.path(general.data.dir, "mouse_gene_info.txt"), sep = "\t", header = TRUE)
orthos <- read.table(file.path(general.data.dir, "human.mouse.orthologs.txt"), sep = "\t", header = TRUE)
data_id <- read.table(here("Data", "Synspse_IDs_Klotho.txt")) #made by hand by copying and pasting file names and IDs from Synapse web page
mouse.data.dir <- here("Data", "Mouse")
if(!file.exists(mouse.data.dir)){dir.create(mouse.data.dir)}

file.dest <- file.path(mouse.data.dir, data_id[,1])
need.to.download <- any(!file.exists(file.dest))

if(need.to.download){
  synLogin()
  logged.in <- TRUE

  for(i in 1:length(file.dest)){
    
    if(!file.exists(file.destp[i])){
      synGet(data_id[i,2], downloadLocation = mouse.data.dir)
    }
  }
}
mouse.info <- read.csv(file.path(mouse.data.dir, "metadata_validated.csv"))
orthos <- read.delim(file.path(general.data.dir, "human.mouse.orthologs.txt"))
if(expression.type == "transcript"){
  count.data <- read.delim(file.path(mouse.data.dir, "rsem.merged.transcript_counts.tsv"))
}else{
  count.data <- read.delim(file.path(mouse.data.dir, "rsem.merged.gene_counts.tsv"))
}
u_id <- unique(mouse.info[,"animalName"])
ind.tables <- lapply(u_id, function(x) mouse.info[which(mouse.info[,"animalName"] == x),])
#This function takes all the information on a single individual and condenses
#it into a more readable format. Each individual has two lines in the table
#one each for the two KL haplotypes. The two lines tell us the genotype.
#we go by the sequenced genotype not by what is in Climb because that is
#more reliable. 
#for each of these haplotypes separately.
condense_geno <- function(call, allele){
  if(is.na(call)){
    return("Inconclusive")
  }
  if(call == "WT"){
    cond.geno <- "WT/WT"
  }
  if(call == "Homozygous"){
    cond.geno <- paste0(allele, "/", allele)
  }
  if(call == "Heterozygous"){
    cond.geno <- paste0("WT/", allele)
  }
  return(cond.geno)
}

merge.geno <- function(fc.geno, vs.geno){
  #if both are wild type
  u_geno <- unique(c(fc.geno, vs.geno))
  if(any(is.na(u_geno))){
    return("Inconclusive")
  }
  if(any(u_geno == "Inconclusive")){
    return("Inconclusive")
  }
  if(length(u_geno) == 1){
    final.geno <- u_geno
  }else{
    not.wt <- which(u_geno != "WT/WT")
    if(length(not.wt) == 1){
      final.geno <- u_geno[not.wt]
    }else{
      final.geno <- "FC/VS"
    }
  }
  return(final.geno)
}


condense_info <- function(individ.table){
  vs.idx <- which(individ.table[,"snp"] == "Klotho-VS")
  fc.idx <- which(individ.table[,"snp"] == "Klotho-FC")
  vs.seq <- condense_geno(call = individ.table[vs.idx,"genotype_seq"], allele = "VS")
  fc.seq <- condense_geno(individ.table[fc.idx,"genotype_seq"], "FC")
  seq.geno <- merge.geno(fc.seq, vs.seq)

  vs.climb <- condense_geno(call = individ.table[vs.idx,"genotype_climb"], allele = "VS")
  fc.climb <- condense_geno(individ.table[fc.idx,"genotype_climb"], "FC")
  climb.geno <- merge.geno(fc.climb, vs.climb)

  #dont.include <- c("genotype_climb", "genotype_seq", "snp", "validated.gt", "line")
  dont.include <- c("genotype_climb", "genotype_seq", "snp", "line")
  include <- setdiff(colnames(individ.table), dont.include)
  final.info <- unlist(c(individ.table[1,include], "seq_genotype" = seq.geno, "climb_geno" = climb.geno))
  return(final.info)
}

condensed.table <- t(sapply(ind.tables, condense_info))
#head(condensed.table)
#one mouse was labeled as 24 months, but it should be 12 months (talked with Dylan Garceau)
#change this by hand

wrong.age <- which(condensed.table[,"age_m"] == 24)
condensed.table[wrong.age,"age_m"] <- 12

#remove animals identified in 1a_Klotho_QC.Rmd
if(remove.qc){
  #remove animals identified by 1a_Klotho_QC.Rmd
  qc.id <- read.table(here("Data", "general", "ID_QC.txt"), header = TRUE)
  for(i in 1:nrow(qc.id)){
    mouse.id <- qc.id[i,1]
    action <- qc.id[i,2]
    mouse.idx <- which(condensed.table[,"animalName"] == mouse.id)
    if(action == "remove"){
      condensed.table <- condensed.table[-mouse.idx,]
    }else{
      condensed.table[mouse.idx,"sex_climb"] <- action
    }
  }
}

#intersect(qc.id[,1], condensed.table[,"animalName"])

The table of animals with called genotypes is as follows:

datatable(condensed.table)
write.table(condensed.table, file.path(results.dir, "mouse.info.csv"), sep = ",", quote = FALSE)

Group Counts

The tables below show the numbers of animals in each group. The x axis in each panel shows the age of the mice in months. The y axis shows each included genotype.

The largest group right now is 12-month-old WT mice. There are no FC/VS mice yet.

sexes <- unique(condensed.table[,"sex_ge"]) #use the sequenced sex
ages <- sort(as.numeric(unique(condensed.table[,"age_m"])))
genotypes <- unique(condensed.table[,"climb_geno"])[c(2,5,4,1,3,6)]

#build a separate table for each allele and sex and count the
#mice in each age group.
count.tables <- vector(mode = "list", length = length(sexes))
names(count.tables) <- sexes
for(s in 1:length(sexes)){
    sex.idx <- which(condensed.table[,"sex_climb"] == sexes[s])
    by.age <- matrix(0, nrow = length(genotypes), ncol = length(ages))
    rownames(by.age) <- genotypes
    colnames(by.age) <- ages
    for(g in 1:length(genotypes)){
      genotype.idx <- grep(genotypes[g], condensed.table[,"climb_geno"])
      for(ag in 1:length(ages)){
        age.idx <- which(condensed.table[,"age_m"] == ages[ag])
        num.in.group <- length(Reduce("intersect", list(sex.idx, genotype.idx, age.idx)))
        by.age[g,ag] <- num.in.group
      }
    }
    count.tables[[s]]  <- by.age
}
#quartz(width = 9, height = 3)
par(mfrow = c(1,2), mar = c(2,8,4,2))
for(s in 1:length(sexes)){
  imageWithText(count.tables[[s]], use.pheatmap.colors = TRUE,
    main = sexes[s], cex = 1, main.shift = 0.15, row.text.shift = 0.1,
    col.text.rotation = 0, col.text.shift = 0.15, col.text.adj = 0.5)
}

inc.idx <- which(condensed.table[,"climb_geno"] == "Inconclusive")
if(length(inc.idx) > 0){
  condensed.table <- condensed.table[-inc.idx,,drop=FALSE]
}

We removed 0 entries with inconclusive genotypes.

Bin ages

The age groups are 4 and 12 months so far. Twenty-four months will be coming later. Here we bin the animals into those age groups.

age.groups <- c(4, 12, 24)
age.diff <- lapply(age.groups, function(x) abs(x-as.numeric(condensed.table[,"age_m"])))
age.which <- lapply(age.diff, function(x) which(x <= 1))
age.batch <- rep(NA, nrow(condensed.table))
for(i in 1:length(age.which)){
  age.batch[age.which[[i]]] <- age.groups[i]
}
condensed.table <- cbind(condensed.table, "age_batch" = age.batch)
sexes <- unique(condensed.table[,"sex_ge"])
ages <- sort(as.numeric(unique(condensed.table[,"age_batch"])))
genotypes <- unique(condensed.table[,"climb_geno"])[c(2,5,4,1,3,6)]

#build a separate table for each allele and sex and count the
#mice in each age group.
count.tables <- vector(mode = "list", length = length(sexes))
names(count.tables) <- sexes
for(s in 1:length(sexes)){
    sex.idx <- which(condensed.table[,"sex_climb"] == sexes[s])
    by.age <- matrix(0, nrow = length(genotypes), ncol = length(ages))
    rownames(by.age) <- genotypes
    colnames(by.age) <- ages
    for(g in 1:length(genotypes)){
      genotype.idx <- grep(genotypes[g], condensed.table[,"climb_geno"])
      for(ag in 1:length(ages)){
        age.idx <- which(condensed.table[,"age_m"] == ages[ag])
        num.in.group <- length(Reduce("intersect", list(sex.idx, genotype.idx, age.idx)))
        by.age[g,ag] <- num.in.group
      }
    }
    count.tables[[s]]  <- by.age
}

full.count.table <- Reduce("cbind", count.tables)
colnames(full.count.table) <- paste(rep(names(count.tables), 
  each = length(count.tables)), colnames(full.count.table), sep = "_")
keep.rows <- which(!is.na(row.names(full.count.table)))
write.table(full.count.table[keep.rows,], 
  here("Results", "for_paper", "mouse_count_table.txt"), sep = "\t",
  quote = FALSE)

The following grid shows how many animals are in each age batch. We will use the age batch as a covariate going forward.

#quartz(width = 8, height = 4)
par(mfrow = c(1,2), mar = c(2,8,4,2))
for(s in 1:length(sexes)){
  imageWithText(count.tables[[s]], use.pheatmap.colors = TRUE,
    main = sexes[s], cex = 1, main.shift = 0.15, row.text.shift = 0.55,
    col.text.rotation = 0, col.text.shift = 0.15, col.text.adj = 0.5)
}

Select Subgroup

If there is a subgroup analysis specified in the parameter section, we select that here.

if(subgroup != "all ages"){
  sub.idx <- Reduce("intersect", lapply(1:length(subgroup), 
    function(x) which(condensed.table[,names(subgroup)[x]] == subgroup[[x]])))
  #overwrite condensed.table
  if(length(sub.idx))
  full.table <- condensed.table
  condensed.table <- condensed.table[sub.idx,]
}

Transcript Data

Here we use transcript count data. These data were generated by Annat Haber and Catrina Spruce. The details of the analysis can be found on Synapse with ID syn52749871.

count.num <- as.matrix(count.data[,3:ncol(count.data)])
count.id <- as.matrix(count.data[,1:2])
rownames(count.num) <- count.id[,1] #name using transcript IDs
colnames(count.num) <- gsub("X", "", colnames(count.num))

#make sure mouse IDs are aligned across the information table
#and data table.
included.idx <- match(condensed.table[,1], colnames(count.num))
#cbind(colnames(count.num)[included.idx], condensed.table[,1])
sub.count <- count.num[,included.idx]

Use DESeq2 to filter out low-expressing genes and to normalize. We use the vst normalization.

#make a data frame with relevant covariate and group data.
info <- data.frame("sequencingBatch" = as.factor(condensed.table[,"sequencingBatch"]),
  "sex_ge" = as.factor(condensed.table[,"sex_ge"]),
  "age_batch" = as.numeric(condensed.table[,"age_batch"]),
  "climb_geno" = factor(condensed.table[,"climb_geno"], 
  levels = c("WT/WT", "WT/FC", "FC/FC", "WT/VS", "VS/VS"))) #put WT as the reference level
rownames(info) <- condensed.table[,"animalName"]

write.table(info, file.path(processed.data.dir, "mouse_info.csv"), sep = ",", row.names = TRUE)

# Create design matrices: Include both the condition of interest and the batch variable.
# Assuming 'genotype' is your condition and 'sequencingBatch' is the batch factor
# age_m and sex_ge are potential covariates

possible.covar <- c("sequencingBatch", "sex_ge", "age_batch", "climb_geno")
use.covar <- setdiff(possible.covar, names(subgroup))
fmla <- as.formula(paste0("~", paste(use.covar, collapse = " + ")))
design.genotype <- model.matrix(fmla, data=info)

#I tested putting different variables as the "group" variable, 
#and all normalizations were identical.
dds <- DESeqDataSetFromMatrix(round(sub.count), colData = info, design = design.genotype)
## converting counts to integer mode
#nrow(dds)

smallestGroupSize <- 4
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
#nrow(dds)

sub.id <- count.id[keep,]

vsd <- vst(dds, blind = FALSE)

#pre.norm.decomp <- plot.decomp(t(assay(dds)), plot.results = FALSE)
#post.norm.decomp <- plot.decomp(t(assay(vsd)), plot.results = FALSE)

#take invariant covariates out of info
num.var <- apply(info, 2, function(x) length(unique(x)))
keep.var <- which(num.var > 1)
covar.mat <- info[,keep.var]
write.table(covar.mat, file.path(results.dir, "covariates.csv"), sep = ",", quote = FALSE)
#gather information about transcripts in filtered data set

gene.info.file <- file.path(general.data.dir, "mouse_gene_info.txt")
if(!file.exists(gene.info.file)){
  library(biomaRt)
  all.var <- ls()

  lib.loaded <- as.logical(length(which(all.var == "mus")))
  if(!lib.loaded){
      mus <- useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl") 
      }
  
  gene.info <- getBM(c("ensembl_gene_id", "external_gene_name", "entrezgene_id", "chromosome_name", 
    "start_position", "end_position"), "ensembl_gene_id", sub.id[,1], mus)
  write.table(gene.info, gene.info.file, quote = FALSE, row.names = FALSE, sep = "\t")
  }else{
  gene.info <- read.delim(gene.info.file)
  }

Batch Effects

We looked at the relationships between the first four PCs of the normalized gene expression matrix and relevant variables. The plots are big, so these are printed to pdfs in the results folder.

top.var.alpha.thresh <- 0.05

Depending on how many of the top variable genes we use, the factors are correlated with different PCs. Here we use all genes for which the model explaining expression has a p value less than 0.05.

#top genes can be a number specifying how many of the most 
#variable genes to use in the decomposition.
#gene.names can be used to specify specific genes to be
#used. This can help in identifying sample swaps in QC

plot_pc_factors <- function(expr.mat, factors, n.pc = 4, top.genes = NULL,
  highlight.ind = NULL, highlight.col = "blue"){
  
  if(!is.data.frame(factors)){stop("factors must be a data frame.")}
  
  main.text <- ""

  if(!is.null(top.genes)){
    gene.var <- apply(expr.mat, 2, var)
    sorted.var <- sort(gene.var, decreasing = TRUE)[1:top.genes]
    expr.mat <- expr.mat[,names(sorted.var)]
    main.text <- paste("\n", top.genes, "most variable genes")
  }

  expr.decomp <- plot.decomp(expr.mat, pc = n.pc, plot.results = FALSE)
  pc.var.exp <- signif(expr.decomp$var.exp*100, 2)[1:n.pc]

  pc.pairs <- pair.matrix(1:n.pc)
  pc.factor.cor <- matrix(ncol = n.pc, nrow = ncol(factors))
  colnames(pc.factor.cor) <- paste0("PC", 1:n.pc)
  rownames(pc.factor.cor) <- colnames(factors)

  #for each factor, plot out the pairs of PCs and individual PCs
  for(i in 1:ncol(factors)){
    cat("###", colnames(factors)[i], "\n")

    #treat everything, even age as a factor
    if(colnames(factors)[i] == "climb_geno"){
      factor.cols <- geno.cols
      factor.names <- names(factor.cols)
    }else{
        factor.names <- levels(as.factor(factors[,i]))
        factor.cols <- c("black", "#CE5C6D", "#7ECC61")
        names(factor.cols) <- factor.names
    }
    gcol <- rep(NA, nrow(factors))
    for(g in 1:length(factor.cols)){
      gcol[which(factors[,i] == names(factor.cols)[g])]  <- factor.cols[g]
    }
  

    #if we are highlighting individuals, change their color here    
    if(!is.null(highlight.ind)){
      ind.idx <- which(rownames(expr.mat) %in% highlight.ind)
      gcol[ind.idx] <- highlight.col
    }

    if(i < ncol(factors)){
      layout.mat <- get.layout.mat(nrow(pc.pairs)+n.pc+2)
    }else{
      layout.mat <- get.layout.mat(nrow(pc.pairs)+n.pc+3)
    }
    layout(layout.mat)

    par(mar = c(4,4,4,2))
    for(p in 1:nrow(pc.pairs)){
      pc1 <- pc.pairs[p,1]
      pc1.var <- pc.var.exp[pc1]
      pc2 <- pc.pairs[p,2]
      pc2.var <- pc.var.exp[pc2]
      if(is.factor(factors[,i])){
        plot(expr.decomp$u[,pc1], expr.decomp$u[,pc2], pch = 16, xlab = paste0("PC", pc1, " (", pc1.var, "%)"),
          ylab = paste0("PC", pc2,  " (", pc2.var, "%)"), col = gcol,
          main = paste(colnames(factors)[i], main.text))
      }else{
        plot(expr.decomp$u[,pc1], expr.decomp$u[,pc2], pch = 16, 
          xlab = paste0("PC", pc1, " (", pc1.var, "%)"),
          ylab = paste0("PC", pc2,  " (", pc2.var, "%)"), col = gcol, 
          main = paste(colnames(factors)[i], main.text))
      }
    }
    
    #add barplots
    for(p in 1:n.pc){
      if(length(levels(as.factor(factors[,i]))) > 1){
        model <- lm(expr.decomp$u[,p]~factors[,i])
        pc.factor.cor[i,p] <- summary(model)$adj.r.squared
      }
      
      pc.order <- order(expr.decomp$u[,p])
      
        barplot(expr.decomp$u[pc.order,p],
          col = gcol[pc.order],
          main = paste("PC", p, "\n", colnames(factors)[i], main.text), border = NA)
          #factors[pc.order[intersect(which(expr.decomp$u[pc.order,p] < 0), which(gcol[pc.order] == "black"))],]
          #factors[pc.order[intersect(which(expr.decomp$u[pc.order,p] > 0), which(gcol[pc.order] != "black"))],]

        #find mis-matches
        #not.neg <- intersect(which(expr.decomp$u[,1] < 0), which(gcol == "#CE5C6D"))
        #rownames(expr.mat)[not.neg]
        #not.pos <- intersect(which(expr.decomp$u[,1] > 0), which(gcol == "black"))
        #rownames(expr.mat)[not.pos]
    }

    #add legend    
    par(mar = c(0,0,2,2))
    plot.new()
    plot.window(xlim = c(0,1), ylim = c(0,1))
    legend(0, 1, legend = factor.names, pch = 16, col = factor.cols)
    

    #show correlate each PC with each factor
    par(mar = c(4,4,2,2))
    if(!all(is.na(pc.factor.cor[i,]))){
      a <- barplot(abs(pc.factor.cor[i,]), ylab = "Adjusted R^2", 
        main = paste0("Adjusted R^2"), ylim = c(0,1),
        names = paste0("PC", 1:n.pc, "\n", pc.var.exp, "%"))
      text(a, abs(pc.factor.cor[i,])+0.04, labels = signif(abs(pc.factor.cor[i,]), 2))
      #text(a, rep(0.04, length(a)), labels = paste(signif(pc.var.exp, 2), "%"))
    }

    cat("\n\n")
  
    if(i == ncol(factors)){
      par(mar = c(6,8,6,2))
      scaled.factor.f <- t(apply(abs(pc.factor.cor), 1, function(x) x/max(x)))
      imageWithText(signif(scaled.factor.f, 2), use.pheatmap.colors = TRUE, col.text.rotation = 0,
        col.text.adj = 0.5, col.text.shift = 0.2, row.text.shift = 0.2, main.shift = 0.2,
        main = "PC-Factor Correlations")
    }
  }
}

We fit a model to explain the expression of each gene with sequencing batch, sex, age, and genotype. We selected all genes that had a nominally significant p value (p < 0.05) for the entire model.

plotting.df <- data.frame("sequencingBatch" = as.factor(condensed.table[,"sequencingBatch"]),
  "sex_ge" = as.factor(condensed.table[,"sex_ge"]),
  "age_batch" = as.numeric(condensed.table[,"age_batch"]),
  "climb_geno" = factor(condensed.table[,"climb_geno"], 
    levels = c("WT/WT", "WT/FC", "FC/FC", "WT/VS", "VS/VS"))) #make WT the reference levl
rownames(plotting.df) <- condensed.table[,"animalName"]


#test genes for variability based on any of the possible causal factors
#then filter for the genes that vary by these, not just variable genes
#in general

expr <- assay(vsd)

models <- apply(expr, 1, function(x) lm(x~sequencingBatch+sex_ge+age_batch+climb_geno, data = plotting.df))
model.f <- lapply(models, function(x) summary(x)$fstatistic)
model.p <- sapply(model.f, function(f) pf(f[1],f[2],f[3],lower.tail=F))
#qqunif.plot(model.p)
#big.p <- which(-log10(model.p) > 10)
sig.idx <- which(model.p < top.var.alpha.thresh)

#checked to see if VS/FC carrier status was better represented in the PCs
#It is not
#carrier.status <- rep("WT", nrow(plotting.df))
#carrier.status[grep("VS", plotting.df[,"climb_geno"])] <- "VS"
#carrier.status[grep("FC", plotting.df[,"climb_geno"])] <- "FC"
#plotting.df <- cbind(plotting.df, "carrier.status" = as.factor(carrier.status))

pdf(file.path(results.dir, 
  paste0("PCs_and_factors_pval_less_than_", top.var.alpha.thresh, ".pdf")), width = 12, height = 8)
plot_pc_factors(expr.mat = t(expr[sig.idx,]), factors = plotting.df, n.pc = 4)

sequencingBatch

sex_ge

age_batch

climb_geno

dev.off()

quartz_off_screen 2

Variance explained by genotype

If we have selected to analyze all ages, the plot below shows how well genotype explains variation in PC1 of the transcription matrix at different ages.

if(subgroup == "all ages"){
  expr.decomp <- plot.decomp(t(expr[sig.idx,]), plot.results = FALSE)
  model <- lm(expr.decomp$u[,1]~as.factor(condensed.table[,"climb_geno"])*as.factor(condensed.table[,"age_batch"]))
  #boxplot(expr.decomp$u[,1]~as.factor(condensed.table[,"climb_geno"])*as.factor(condensed.table[,"age_batch"]))
  ylim <- c(min(expr.decomp$u[,1]), max(expr.decomp$u[,1]))

  geno <- condensed.table[,"climb_geno"]
  age <- condensed.table[,"age_batch"]

  u_age <- unique(age)

  #quartz(width = 10, height = 5)
  par(mfrow = c(1,2), mar = c(6,4,4,2))
  for(a in 1:length(u_age)){
    age.idx <- which(age == u_age[a])
    age.geno <- geno[age.idx]
    age.pc <- expr.decomp$u[age.idx,1]
    age.geno.pc <- lapply(names(geno.cols), function(x) age.pc[which(age.geno == x)])
    age.model <- lm(age.pc~as.factor(age.geno))
    age.r2 <- summary(age.model)$adj.r.squared
    age.f <- summary(age.model)$fstatistic
    age.p <- pf(age.f[1], age.f[2], age.f[3],lower.tail=F)
    boxplot(age.geno.pc, names = names(geno.cols), col = geno.cols, ylab = "PC1",
      main = paste0("PC1 by genotype at ", u_age[a], " months\nR2 = ", 
      signif(age.r2, 2), "; p = ", signif(age.p, 2)), ylim = ylim) 
    abline(h = 0)
  }
}

This yielded 7569 genes. We used this subset of genes in the PC plots to see which factors contributed to overall variation in gene expression.

Klotho variation

The following plots show how Klotho expression varies with age, sex, and genotype. There isn’t much variation in Klotho transcription in these animals. That’s probably fine. The Klotho variants we are dealing with are coding variants and probably don’t have any effect on gene expression.

norm.expr <- assay(vsd)
saveRDS(norm.expr, file.path(processed.data.dir, "Normalized_Expression.RDS"))

#scale expression across individuals
scaled.expr <- t(apply(norm.expr, 1, scale))
dimnames(scaled.expr) <- dimnames(norm.expr)
saveRDS(scaled.expr, file.path(processed.data.dir, "Scaled_Expression.RDS"))

if(expression.type == "transcript"){
  tx.id <- sub.id[which(sub.id[,"gene_id"] == kl.ensembl),"transcript_id"]  
}else{
  tx.id <- sub.id[which(sub.id[,"gene_id"] == kl.ensembl),"gene_id"]  
}

kl.levels <- scaled.expr[tx.id,,drop=FALSE]

if(expression.type == "transcript"){
  plot.with.model(kl.levels[1,], kl.levels[2,], xlab = "Kl transcript 1",
  ylab = "Kl transcript 2")
}

plot_tx_with_factor <- function(expr.mat, covar.table, tx_name, factor_name, 
  factor_type = "categorical", ylab = "Count",
  tx_label = "Transcript", pt_col = "#c51b8a", cex.labels = 1){

    #make a dummy matrix to adjust expression 
    factor.idx <- which(colnames(covar.table) == factor_name)
    dummy.mat <- dummy_covar(as.matrix(covar.table[,-c(factor.idx)]))

  for(i in 1:length(tx_name)){

    if(factor_type == "categorical"){
      model <- lm(expr.mat[tx_name[i],]~covar.table[,factor_name])
      model.p <- signif(anova(model)$"Pr(>F)"[1], 2)
      vioplot(expr.mat[tx_name[i],]~covar.table[,factor_name], xlab = "", 
        main = paste(tx_label[i], "\np =", model.p), col = "lightgray",
        cex.names = cex.labels, cex.axis = cex.labels, ylab = "")
      mtext(ylab, side = 2, line = 2.5, cex = cex.labels)
      stripchart(expr.mat[tx_name[i],]~covar.table[,factor_name], 
        col = pt_col, vertical = TRUE, pch = 16, method = "jitter", add = TRUE)
    }
    if(factor_type == "numeric"){
      model <- lm(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]))
      model.p <- signif(anova(model)$"Pr(>F)"[1], 2)
      vioplot(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]), xlab = "", 
        main = paste(tx_label[i], "\np =", model.p), col = "lightgray",
        cex.names = cex.labels, cex.axis = cex.labels, ylab = "")
      mtext(ylab, side = 2, line = 2.5, cex = cex.labels)
      stripchart(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]), 
        col = pt_col, vertical = TRUE, pch = 16, method = "jitter", add = TRUE)
    }
  }
}

Genotype

plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
  tx_name = tx.id, "climb_geno", 
  tx_label = paste0("Kl Transcript", 1:length(tx.id)), 
  ylab = "Expression (A.U.)", order.by.mean = FALSE)

if(subgroup[[1]] == 12){
  pdf(here("Results", "for_paper", 
    paste0("Klotho_expression_by_genotype_", basename(results.dir), "_months.pdf")), 
    width = 5, height = 6)
    par(bg = NA)
    plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
      tx_name = tx.id, "climb_geno", 
      tx_label = paste0("Kl Transcript", 1:length(tx.id)), 
      ylab = "Expression (A.U.)", order.by.mean = FALSE)
  dev.off()
}
## quartz_off_screen 
##                 2

Sex

par(mfrow = c(1,2))
plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat, 
  tx_name = tx.id, factor_name = "sex_ge", 
  tx_label = paste0("Kl Transcript", 1:length(tx.id)), 
  ylab = "Expression (A.U.)")

if(subgroup[[1]] == 12){
  pdf(here("Results", "for_paper", 
    paste0("Klotho_expression_by_sex_", basename(results.dir), "_months.pdf")), 
    width = 5, height = 5)
    par(bg = NA)
  plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat, 
    tx_name = tx.id, factor_name = "sex_ge", 
    tx_label = paste0("Kl Transcript", 1:length(tx.id)), 
    ylab = "Expression (A.U.)", cex.labels = 1.5)
  dev.off()
}
## quartz_off_screen 
##                 2

Age

age.varies <- which(colnames(covar.mat) == "age_batch")
if(length(age.varies) > 0){
  par(mfrow = c(1,2))
  plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat, 
    tx.id, "age_batch", "numeric", 
    paste0("Kl Transcript", 1:length(tx.id)), 
    ylab = "Expression (A.U.)", cex.labels = 1.5)
}else{
  plot.text("Age does not vary in this population")
}

if(length(age.varies) > 0){
  pdf(here("Results", "for_paper", 
    paste0("Klotho_expression_by_sex_", basename(results.dir), "_months.pdf")), 
    width = 5, height = 5)
    par(bg = NA)
  age.varies <- which(colnames(covar.mat) == "age_batch")
  plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat, 
    tx.id, "age_batch", "numeric", 
    paste0("Kl Transcript", 1:length(tx.id)), 
    ylab = "Expression (A.U.)", cex.label = 1.5)
  dev.off()
}

Differential Expression

We tested for differential expression with the different factors. For each factor, we adjusted the expression matrix for the other factors and test for the final factor.

test_full_model <- function(expr.mat, covar.mat){
  
  full.model.file <- file.path(results.dir, "Full_Model_Results.RDS")
  
  if(!file.exists(full.model.file)){
    #match covariates to adjusted expression matrix
    common.ind <- intersect(colnames(expr.mat), rownames(covar.mat))
    expr.idx <- match(common.ind, colnames(expr.mat))
    covar.idx <- match(common.ind, rownames(covar.mat))

    #adjust for sequencing batch
    dummy.batch <- dummy_covar(covar.mat[covar.idx,"sequencingBatch",drop=FALSE])

    adj.expr.file <- file.path(processed.data.dir, "Batch_Adjusted_Expr.RDS")
    if(!file.exists(adj.expr.file)){
      adj.expr <- adjust(t(expr.mat[,expr.idx]), dummy.batch)
      saveRDS(adj.expr, adj.expr.file)
    }else{
      adj.expr <- readRDS(adj.expr.file)
    }

    #test one to get factor names
    df <- cbind("expr" = adj.expr[,1], covar.mat)
    diff.test <- lm(expr~(sex_ge+age_batch+climb_geno)^2, data = df)
    factor.names <- rownames(anova(diff.test))
    num.factors <- length(factor.names)-1

    #test all main effects and interactions
    effect.p <- matrix(NA, nrow = ncol(adj.expr), ncol = num.factors)
    colnames(effect.p) <- factor.names[1:num.factors]
    model.r2 <- rep(NA, ncol(adj.expr))
    for(i in 1:ncol(adj.expr)){
      df <- cbind("expr" = adj.expr[,i], covar.mat)
      diff.test <- lm(expr~(sex_ge+age_batch+climb_geno)^2, data = df)
      model.r2[i] <- summary(diff.test)$adj.r.squared
      effect.p[i,] <- anova(diff.test)$"Pr(>F)"[1:num.factors]
      #coef(diff.test)
    }
    rownames(effect.p) <- names(model.r2) <- colnames(adj.expr)
    #par(mar = c(12,4,4,2)); boxplot(-log10(effect.p), las = 2, ylab = "-log10(p)")
    #plot.decomp(t(-log10(effect.p)), label.points = TRUE)
    result <- list("p_values" = effect.p, "R2" = model.r2)
    saveRDS(result, full.model.file)
  }else{
    result <- readRDS(full.model.file)
  }
    return(result)
}

We first used a linear model to test for main effects of each covariate along with interactions between the covariates.

full.model.result <- test_full_model(expr.mat = scaled.expr, covar.mat = plotting.df)
effect.p <- full.model.result$p_values
model.r2 <- full.model.result$R2

#take out the genes with the largest sex effect. These are all X and Y genes.
sex.effect.threshold = 10
big.sex.effect <- which(-log10(effect.p[,"sex_ge"]) > sex.effect.threshold)
big.sex.genes <- gene.info[match(names(big.sex.effect), gene.info[,"ensembl_gene_id"]),]

effect.p <- effect.p[-big.sex.effect,]

we removed the genes with the sex effects greater than 10. There were 12 with effects this large. They are shown below and reside mostly on the sex chromosomes.

colnames(big.sex.genes) <- c("geneID", "geneName","entrezID", "Chr", "start", "end")
datatable(big.sex.genes)

The box plot below shows the distributions of the effects of each factor with the sex-specific genes removed.

par(mar = c(12,4,4,2))
boxplot(-log10(effect.p), las = 2, ylab = "-log10(p)")

The following are qq plots for each p value distribution. These are interesting. It looks as if almost every gene’s transcription is affected by Klotho genotype. There are many that are affected by age. There is a large effect of sex, but not as big as the age or genotype effect.

There really arent any genes with an interaction effect between sex and genotype. Same for the interaction between sex and genotype. This is unexpected.

There are many more genes that have a sex by age interaction.

effect.fdr  <- apply(effect.p, 2, function(x) p.adjust(x, "fdr"))
model.r2 <- model.r2[-big.sex.effect]

layout(get.layout.mat(ncol(effect.p)))
for(i in 1:ncol(effect.p)){
  qqunif.plot(effect.p[,i], plot.label = colnames(effect.p)[i])
}

The following bar plot shows the number of genes with significant effects due to each factor with an FDR less than 0.1.

Klotho genotype affected expression of the most genes, followed by age. Sex affected very few, which I think is consistent with previous results in the brain.

A handful of genes had a sex by age interaction, and two had a genotype by age interaction.

factor.fdr <- apply(effect.p, 2, function(x) p.adjust(x, "fdr"))
sig.factor <- apply(factor.fdr, 2, function(x) which(x <= fdr.thresh))
#sig.factor <- apply(effect.p, 2, function(x) which(x <= 0.01))
num.sig <- sapply(sig.factor, length)
par(mar = c(12, 4, 4, 2))
barplot_with_num(num.sig, las = 2, ylab = "Count")

saveRDS(num.sig, 
  here("Results", "for_paper", paste0("num_de_", basename(results.dir), ".RDS")))

The following heat map shows enrichments for genes that were nominally significant for each main effect or interaction.

enrich <- lapply(sig.factor, function(x) gost(names(x), organism = "mmusculus", sources = c("GO", "KEGG", "CORUM", "HP", "REACTOME")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
plot.enrichment.group(enrich, max.term.size = 3000, transformation = sqrt, max.char = 100)

Interaction plots for any genes with significant age by genotype interactions are printed to the results folder for this run in a pdf called age_geno_int.pdf.

adj.expr <- readRDS(file.path(processed.data.dir, "Batch_Adjusted_Expr.RDS"))

age.geno.col <- which(colnames(factor.fdr) == "age_batch:climb_geno")

if(length(age.geno.col) > 0){
  sig.idx <- which(factor.fdr[,age.geno.col] <= fdr.thresh)

  if(length(sig.idx) > 0){
    pdf(file.path(results.dir, "age_geno_int.pdf"), width = 7, height = 5)
      for(i in 1:length(sig.idx)){
        gene.id <- rownames(factor.fdr)[sig.idx[i]]
        model <- lm(adj.expr[,gene.id]~plotting.df[,"age_batch"]*plotting.df[,"climb_geno"])
        #summary(model)
        #anova(model)
        #gene.id <- sample(rownames(factor.fdr), 1)
        gene.name <- gene.info[which(gene.info[,"ensembl_gene_id"] == gene.id),"external_gene_name"]
        interaction.plot(plotting.df[,"age_batch"], plotting.df[,"climb_geno"], adj.expr[,gene.id], col = geno.cols, lwd = 3,
          xlab = "Age", ylab = "Expression", main = gene.name, trace.label = "genotype")
      }
    dev.off()
  }
}

Interaction plots for genes with significant sex by genotype interactions are printed to the results folder for this run in a pdf called sex_geno_int.pdf.

sex.geno.col <- which(colnames(factor.fdr) == "sex_ge:climb_geno")

if(length(sex.geno.col) > 0){
  sig.idx <- which(factor.fdr[,sex.geno.col] <= fdr.thresh)

  if(length(sig.idx) > 0){
    pdf(file.path(results.dir, "sex_geno_int.pdf"), width = 7, height = 5)
    for(i in 1:length(sig.idx)){
      gene.id <- rownames(factor.fdr)[sig.idx[i]]
      model <- lm(adj.expr[,gene.id]~plotting.df[,"sex_ge"]*plotting.df[,"climb_geno"])
      #summary(model)
      #anova(model)
      #gene.id <- sample(rownames(factor.fdr), 1)
      gene.name <- gene.info[which(gene.info[,"ensembl_gene_id"] == gene.id),"external_gene_name"]
      interaction.plot(plotting.df[,"sex_ge"], plotting.df[,"climb_geno"], adj.expr[,gene.id], col = geno.cols, lwd = 3,
        xlab = "Sex", ylab = "Expression", main = gene.name, trace.label = "genotype")
    }
    dev.off()
  }
}
## quartz_off_screen 
##                 2
de.files <- get.files(here("Results", "for_paper"), want = c("num_de", "RDS"), full.names = TRUE)
factor.labels <- c("sex_ge" = "sex", "climb_geno" = "Klotho genotype", "age_batch" = "age")

if(length(de.files) == 3){
  num.de <- lapply(de.files, readRDS)
  names(num.de) <- gsub(".RDS", "", gsub("num_de_", "", basename(de.files)))
  all.factors <- Reduce("union", lapply(num.de, names))

  count.table <- matrix(NA, ncol = length(all.factors), nrow = length(num.de))
  colnames(count.table) <- all.factors
  rownames(count.table) <- names(num.de)
  for(i in 1:length(num.de)){
    count.table[i,names(num.de[[i]])] <- num.de[[i]]
  }
  #for now, make group a column, so we can turn it into a LaTeX table more easily
  count.table <- cbind("group" = names(num.de), count.table)

  for(i in 1:length(factor.labels)){
    colnames(count.table) <- gsub(names(factor.labels)[i], factor.labels[i], colnames(count.table))
  }

  write.table(count.table, here("Results", "for_paper", "num_de_table.txt"), 
    row.names = FALSE, na = "--", quote = FALSE, sep = "\t")

  #cd to ~/Documents/git_repositories/LaTeX-Table-Generator
  #run python3 table_generator.py
  #move the result from Outputs to the for_paper directory
  #open the file and paste the table into the manuscript document
  #paste the result into LaTeXit to test if needed.
}

DEG by genotype

The heatmap below shows the mean expression of differentially expressed genes for each genotype. This is with an fdr value of 0.1.

sig.geno.idx <- which(effect.fdr[,"climb_geno"] <= fdr.thresh)

sig.expr <- scaled.expr[names(sig.geno.idx),]

#divide into groups based on carrier status
alleles <- genotypes[1:5]
allele.idx <- lapply(alleles, function(x) which(info[,"climb_geno"] == x))

diff.expr <- lapply(allele.idx, function(x) sig.expr[,x])
mean.sig.expr <- sapply(diff.expr, rowMeans)
colnames(mean.sig.expr) <- alleles

expr.clust <- pam(mean.sig.expr, k = 2)
col.order <- order(expr.clust$clustering)
annot.col <- data.frame(as.factor(expr.clust$clustering))
colnames(annot.col) <- "cluster"
pheatmap(t(mean.sig.expr[col.order,ordered.geno]), show_colnames = FALSE, cluster_cols = FALSE, 
  cluster_rows = FALSE, annotation_col = annot.col)

gene.names <- mouse.gene.table[match(rownames(annot.col), mouse.gene.table[,"ensembl_gene_id"]), "external_gene_name"]
write.table(cbind(gene.names, annot.col)[order(annot.col[,1]),], file.path(results.dir, "DEG_clusters.txt"), 
  sep = "\t", quote = FALSE)
if(subgroup[[1]] == 12){
  cluster.id <- expr.clust$clustering
  png(here("Results", "for_paper", "deg_clusters.png"), width = 7, height = 3,  units = "in", res = 300)
  scaled.mat <- t(apply(mean.sig.expr[col.order,ordered.geno], 1, scale))
  dimnames(scaled.mat) <- dimnames(mean.sig.expr[,ordered.geno])

  layout.mat <- matrix(c(1,2), nrow = 1)
  layout(layout.mat, widths = c(1, 0.2))
  par(mar = c(2,4,4,1), xpd = NA)
  imageWithText(t(scaled.mat), show.text = FALSE, use.pheatmap.colors = TRUE,
    col.names = NULL, row.text.shift = 0.07, row.text.adj = 0.5)
  plot.dim <- par("usr")
  plot.height <- plot.dim[4] - plot.dim[3]
  plot.width <- plot.dim[2] - plot.dim[1]
  c1.mid <- mean(which(cluster.id[col.order] == 1))
  c2.mid <- mean(which(cluster.id[col.order] == 2))
  label.y <- (plot.dim[4]+(plot.height*0.09))
  text(x = c1.mid, y = label.y, labels = "Neuron and Synapse")
  text(x = c2.mid, y = label.y, labels = "Mitochondria and Ribosome")

  #add colored labels for the genotypes
  y.vals <- 1:length(ordered.geno)
  y.shift <- plot.height*0.06
  label.xmin <- plot.dim[1] - (plot.width*0.09)
  label.xmax <- plot.dim[1] + (plot.width*0.03)
  for(i in 1:length(y.vals)){
    draw.rectangle(label.xmin, label.xmax, (y.vals[i] - y.shift), (y.vals[i] + y.shift),
      border = geno.cols[match(rev(ordered.geno)[i], names(geno.cols))], 
      lwd = 3)
  }

  #add scale bar
  par(mar = c(2,4,4,1))
  imageWithTextColorbar(mean.sig.expr, use.pheatmap.colors = TRUE, cex = 1, bar.lwd = 3)
  dev.off()
}
## quartz_off_screen 
##                 2

We clustered these genes into two groups and looked at enrichment. The enrichments of the groups are shown below.

group.genes <- lapply(1:2, function(x) rownames(annot.col)[which(annot.col[,1] == x)])
group.enrich = lapply(group.genes, function(x) gost(x, organism = "mmusculus",
  sources = c("GO", "KEGG", "REACOME", "CORUM", "HP", "WP")))
names(group.enrich) <- c(1,2)
plot.enrichment.group(group.enrich, max.term.size = 3000, n.terms = 15)

if(subgroup[[1]] == 12){
  pdf(here("Results", "for_paper", "Enrichment.pdf"), width = 8, height = 7)
  for(i in 1:length(group.enrich)){
    plot.enrichment(group.enrich[[i]], plot.label = paste("Cluster", i), 
      max.term.size = 3000, num.terms = 15)
  }
  dev.off()
}
## quartz_off_screen 
##                 2

The following correlation matrix heat map shows that the VS homo- and heterozygote effects are correlated with each other, as are the FC homo- and heterozygote effects. The VS and FC effects are negatively correlated with each other, meaning that when the FC genotype has increased expression of a gene, the VS genotype tends to have decreased expression of that gene.

The FC genotypes are more correlated with each other than the VS genotypes suggesting that the heterozygote and homozygote of the VS alleles have different effects more often than the homo- and heterozygote of the FC alleles.

The WT mice are also more correlated with the FC than with the VS animals.

pheatmap(cor(mean.sig.expr), display_numbers = TRUE)

The following table shows functional enrichments for the group of genes that are differentially expressed across Klotho genotypes.

sig.enrich <- gost(rownames(mean.sig.expr), organism = "mmusculus", 
  sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP", "HPA"))
plot.enrichment(sig.enrich, plot.label = "DEG by Klotho genotype", num.terms = 30,
  max.term.size = 3000)

The following plot is a wordcloud version of the above data.

#pdf("~/Desktop/overall_enrich.pdf", width = 12, height = 7)
par(mfrow = c(1,2), mar = c(0,0,0,0))
plot.enrichment.wordcloud(sig.enrich, plot.label = "DEG by Klotho genotype", num.terms = 20,
  max.term.size = 3000, just.wordcloud = FALSE)

#dev.off()

The following plot shows that compared with WT mice, the FC and VS alleles have opposing effects on many genes.

vs.idx <- grep("VS", colnames(mean.sig.expr))
vs.mean <- rowMeans(mean.sig.expr[,vs.idx])

fc.idx <- grep("FC", colnames(mean.sig.expr))
fc.mean <- rowMeans(mean.sig.expr[,fc.idx])

plot(vs.mean, fc.mean, xlab = "VS mean expression", 
  ylab = "FC mean expression")
abline(h = 0, v = 0)

The following plot shows the enrichments for the genes in each quadrant above.

quads <- pair.matrix(c("up", "down"), TRUE, TRUE)
enrich.idx <- vector(mode = "list", length = nrow(quads))
for(i in 1:nrow(quads)){
  if(quads[i,1] == "up"){
    x.idx <- which(vs.mean > 0)
  }else{
    x.idx <- which(vs.mean < 0)
  }

  if(quads[i,2] == "up"){
    y.idx <- which(fc.mean > 0)
  }else{
    y.idx <- which(fc.mean < 0)
  }
  enrich.idx[[i]] <- intersect(x.idx, y.idx)
}

quad.enrich <- lapply(enrich.idx, function(x) gost(names(vs.mean)[x], 
  organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
#pdf("~/Desktop/enrichment.pdf", width = 10, height = 10)
layout.matrix <- matrix(c(4,1,3,2), nrow = 2, byrow = TRUE)
layout(layout.matrix)
par(mar = c(4,0,4,0))
for(i in 1:length(quad.enrich)){
  if(i == 3){max.cex = 2}else{max.cex = 2.5}
  plot.enrichment.wordcloud(quad.enrich[[i]], max.term.size = 3000,
  plot.label = "", just.wordcloud = TRUE, max.cex = max.cex, num.terms = 30)
  mtext(paste0("VS ", quads[i,1], "; FC ", quads[i,2]), side = 3, font = 2)
  plot.dim <- par("usr")
  draw.rectangle(plot.dim[1],plot.dim[2],plot.dim[3],plot.dim[4])
}

#dev.off()

AD Gene expression

We are interested in how Klotho genotype affects expression of genes that have previously been associated with Alzheimer’s disease. We downloaded a list of AD-associated genes from Agora: https://agora.adknowledgeportal.org/genes/nominated-targets This is their list of nominated targets.

We can separate the genes by evidence, e.g. gene expression, protein levels, genetic evidence, etc. Here we look at all AD genes.

#ad.gene.list <- read.delim(here("Data", "Human", "AD_genes.txt"), header = FALSE) #from book
ad.gene.list <- read.csv(here("Data", "Human", "gene-list.csv"), comment.char = "#")

#we can filter by a number of features:
#filter to those with RNA evidence
#has.rna <- grep("RNA", ad.gene.list[,"Input.Data"])
#ad.gene.list <- ad.gene.list[has.rna,]

#pick genes with more than one nomination
#multiple.noms <- which(ad.gene.list[,"Nominations"] > 1)
#ad.gene.list <- ad.gene.list[multiple.noms,]

common.ad <- intersect(ad.gene.list[,1], orthos[,"Human.Gene.Name"])
ad.orthos <- orthos[match(common.ad, orthos[,"Human.Gene.Name"]),"Mouse.Ortholog.Name"]
ad.ensembl <- intersect(gene.info[match(intersect(ad.orthos, gene.info[,"external_gene_name"]), 
  gene.info[,"external_gene_name"]), "ensembl_gene_id"], rownames(effect.fdr))
ad.sig <- intersect(rownames(mean.sig.expr), ad.ensembl)

The following plot shows how the differentially expressed genes and AD-associated genes interact with each other.

sig.list <- list("All genes" = rownames(effect.fdr), "DEG" = rownames(mean.sig.expr),
  "AD Genes" = ad.ensembl)
#plotVenn(Vlist = sig.list)

circle.cols <- c("all" = "black", "deg" = "#bf812d", "ad" = "#756bb1")

ind.num <- sapply(sig.list, length)
deg.only <- length(setdiff(sig.list$DEG, sig.list$"AD Genes"))
ad.only <- length(setdiff(sig.list$"AD Genes", sig.list$DEG))
deg.and.ad <- length(intersect(sig.list$"AD Genes", sig.list$DEG))

par(mar = c(0,0,0,0))
plot.new()
plot.window(xlim = c(0, 1), ylim = c(0,1))

inner.center <- 0.45
inner.label.y <- 0.62
deg.center <- 0.4 
ad.center <- 0.7
label.cex <- 1.2

#all.circle <- get_circle(radius = 0.45, center_x = 0.5, center_y = inner.center)
#points(all.circle$x, all.circle$y, col = circle.cols["all"], type = "l", lwd = 3)
draw.rectangle(min.x = 0.12, max.x = 0.92, min.y = 0.15, max.y = 0.85, border = circle.cols["all"], lwd = 3)
deg.circle <- get_circle(radius = 0.25, center_x = 0.4, center_y = inner.center)
points(deg.circle$x, deg.circle$y, col = circle.cols["deg"], type = "l", lwd = 3)
ad.circle <- get_circle(radius = 0.15, center_x = 0.7, center_y = inner.center)
points(ad.circle$x, ad.circle$y, col = circle.cols["ad"], type = "l", lwd = 3)


text(x = 0.52, y = 0.81, labels = paste0("All (", ind.num["All genes"], ")"), 
  col = circle.cols["all"], font = 2, cex = label.cex)
text(x = deg.center, y = inner.label.y, labels = paste0("DEG (", ind.num["DEG"], ")"), 
  col = circle.cols["deg"], font = 2, cex = label.cex)
text(x = ad.center+0.035, y = inner.label.y, labels = paste0("AD (", ind.num["AD Genes"], ")"), 
  col = circle.cols["ad"], font = 2, cex = label.cex)

text(x = deg.center, y = inner.center, labels = deg.only, font = 2, cex = label.cex, col = circle.cols["deg"])
text(x = ad.center+0.02, y = inner.center, labels = ad.only, font = 2, cex = label.cex, col = circle.cols["ad"])
text(x = 0.6, y = inner.center, labels = deg.and.ad, font = 2, cex = label.cex, col = mix_colors(circle.cols["deg"], circle.cols["ad"]))

AD gene overview

Are AD genes enriched among the differentially expressed genes? Are there more AD genes among the DEG than we would expect by chance? We used the hypergeometric distribution to answer this.

We have an urn with \(N\) total marbles. Some of these marbles (\(K\) red marbles) are differentially expressed. \(N-K\) green marbles are not differentially expressed. If we draw \(n\) AD genes, what is the probability that exaclty \(k\) of them are red (i.e. differentially expressed)?

. We want to know the probability of drawing \(k\)

#quartz(width = 5, height = 5)
plot.new()
par(mar = c(1,1,1,1))
plot.window(xlim = c(0, 1), ylim = c(0, 1))
draw.rectangle(0.1, 0.9, 0.1, 0.9)

gene.circle <- get_circle(radius = 0.25, center_x = 0.4, center_y = 0.5)
lower.idx <- which(gene.circle$y <= 0.5)
upper.idx <- which(gene.circle$y >= 0.5)
plot.poly.xy(gene.circle$x[upper.idx], gene.circle$y[upper.idx], 
  gene.circle$x[lower.idx], gene.circle$y[lower.idx], border = NA,
  col = rgb(190/256,174/256,212/256))
points(gene.circle$x, gene.circle$y, type = "l")

ad.circle <- get_circle(radius = 0.25, center_x = 0.6, center_y = 0.5)
lower.idx <- which(ad.circle$y <= 0.5)
upper.idx <- which(ad.circle$y >= 0.5)
plot.poly.xy(ad.circle$x[upper.idx], ad.circle$y[upper.idx], 
  ad.circle$x[lower.idx], ad.circle$y[lower.idx], border = NA,
  col = rgb(127/256,201/256,127/256, alpha = 0.5))
points(ad.circle$x, ad.circle$y, type = "l")

#add labels
text(x = 0.72, y = 0.12, labels = "All genes = N", adj = 0)
text(x = 0.25, y = 0.76, labels = "DEG = K")
text(x = 0.83, y = 0.76, labels = "AD genes = n", adj = 1)
text(x = 0.5, y = 0.5, labels = "AD and DEG = k")
mtext("What is the probability that in our draw of AD genes\n
  we get k that are differentially expressed?",
  side = 1, line = -2.5, padj = 0)

#from: https://montilab.github.io/BS831/articles/docs/HyperEnrichment.html

all.gene.id <- gsub(".value", "", rownames(full.model.result$p_values))
all.gene.name <- gene.info[match(all.gene.id, gene.info[,"ensembl_gene_id"]), "external_gene_name"]
sig.deg <- rownames(sig.expr)
#ad.orthos

p.of.draw <- phyper(q = length(ad.sig)-1, #number of red marbles in our draw (number of AD genes that are DEG)
  m = length(sig.deg), #number of red marbles (DEG) in the urn
  n = length(all.gene.id) - length(sig.deg), #number of green marbles (not DEG)
  k = length(ad.orthos), #number of drawn marbles (AD genes)
  lower.tail = FALSE #compute P(X > 1 overlap), hence the -1 above.
)

The calculated probability of there 260 differentially expressed AD genes without enrichment in the set of all DEGs is 2^{-19}, which we correct to 2.2^{-16}.

if(subgroup[[1]] == 12){
  
cluster.id <- expr.clust$clustering
  png(here("Results", "for_paper", "deg_clusters.png"), width = 7, height = 3.5,  units = "in", res = 300)
  scaled.mat <- t(apply(mean.sig.expr[col.order,ordered.geno], 1, scale))
  dimnames(scaled.mat) <- dimnames(mean.sig.expr[,ordered.geno])

  layout.mat <- matrix(c(1,2), nrow = 1)
  layout(layout.mat, widths = c(1, 0.2))
  par(mar = c(2,4,4,1), xpd = NA)
  imageWithText(t(scaled.mat), show.text = FALSE, use.pheatmap.colors = TRUE,
    col.names = NULL, row.text.shift = 0.07, row.text.adj = 0.5)
  plot.dim <- par("usr")
  plot.height <- plot.dim[4] - plot.dim[3]
  plot.width <- plot.dim[2] - plot.dim[1]
  c1.idx <- which(cluster.id[col.order] == 1)
  c2.idx <- which(cluster.id[col.order] == 2)
  c1.mid <- mean(c1.idx)
  c2.mid <- mean(c2.idx)
  label.y <- (plot.dim[4]+(plot.height*0.09))
  text(x = c1.mid, y = label.y, labels = "Mitochondria and Ribosome")
  text(x = c2.mid, y = label.y, labels = "Neuron and Synapse")

  #add colored labels for the genotypes
  y.vals <- 1:length(ordered.geno)
  y.shift <- plot.height*0.06
  label.xmin <- plot.dim[1] - (plot.width*0.09)
  label.xmax <- plot.dim[1] + (plot.width*0.03)
  for(i in 1:length(y.vals)){
    draw.rectangle(label.xmin, label.xmax, (y.vals[i] - y.shift), (y.vals[i] + y.shift),
      border = geno.cols[match(rev(ordered.geno)[i], names(geno.cols))], 
      lwd = 3)
  }

  #add markes for AD genes
  ad.id <- gene.info[match(ad.orthos, gene.info[,"external_gene_name"]), "ensembl_gene_id"]
  to.mark <- intersect(ad.id[which(!is.na(ad.id))], rownames(scaled.mat))
  ad.idx <- match(to.mark, rownames(scaled.mat))
  text(x = ad.idx, y = 0.35, labels = "|", col = circle.cols["ad"])
  text(x = -65, y = 0.35, labels = "AD gene", adj = 1)

  c1.ad.perc <- round(length(intersect(ad.idx, c1.idx))/length(c1.idx)*100)
  c2.ad.perc <- round(length(intersect(ad.idx, c2.idx))/length(c2.idx)*100)
  text(x = c1.mid, y = 0, labels = paste0(c1.ad.perc, "% AD"))
  text(x = c2.mid, y = 0, labels = paste0(c2.ad.perc, "% AD"))

  #add scale bar
  par(mar = c(2,4,4,1))
  imageWithTextColorbar(mean.sig.expr, use.pheatmap.colors = TRUE, cex = 1, bar.lwd = 3)
  dev.off()
}
## quartz_off_screen 
##                 2
sig.info <- gene.info[match(rownames(mean.sig.expr), gene.info[,"ensembl_gene_id"]),]
expr.table <- mean.sig.expr
colnames(expr.table)  <- paste0(colnames(expr.table), "_mean_expression")
is.agora <- rep("no", nrow(sig.info))
is.agora[which(sig.info[,"external_gene_name"] %in% ad.orthos)] <- "yes"
supp.table <- cbind(sig.info, expr.table, "Agora_target" = is.agora, 
  "Cluster_ID" = cluster.id[rownames(mean.sig.expr)])
write.table(supp.table, here("Results", "for_paper", "Supp_Table_DEG.csv"),
  sep = ",", quote = FALSE, row.names = FALSE)

Individual AD genes

There were multiple individual genes of interest in this list:

APOE was upregulated in the VS animals reltaive to WT and FC animals. APOE mRNA has been shown to be more abundant in AD brains (PMID: 7751846, 27104063), so this possibly contradicts the protective effect of the VS allele.

Celf1 is also low in the VS animals relative to the FC animals. It has been previously shown that CELF1 expression is low in AD brains (mentioned in PMID: 38768546), so this also possibly contradicts the protective effect of the VS allele.

Trem2 is downregulated in the FC animals relative to the others. This could be protective since Trem2 is more highly expressed in AD brains and may be associated with increased macrophage recruitment (PMID: 33516818).

So we see a mixed bag with expression of individual genes. Thus far, the VS mice do not look as if their brains have less AD-like expression.

ad.interest <- c("App", "Apoe", "Celf1", "Trem2")

for(i in 1:length(ad.interest)){
  cat("###", ad.interest[i], "\n")
  gene.id <- gene.info[which(gene.info[,"external_gene_name" ]== ad.interest[i]), "ensembl_gene_id"]
  test <- plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
    tx_name = gene.id, tx_label = ad.interest[i], ylab = "Expression (A.U.)",
    order.by.mean = FALSE, plot.results = TRUE)
  cat("\n\n")
}

App

Apoe

Celf1

Trem2

if(subgroup[[1]] == 12){
  pdf(here("Results", "for_paper", "AD-related_expression.pdf"), width = 7, height = 7)
  for(i in 1:length(ad.interest)){
    gene.id <- gene.info[which(gene.info[,"external_gene_name" ]== ad.interest[i]), "ensembl_gene_id"]
    test <- plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
      tx_name = gene.id, tx_label = ad.interest[i], ylab = "Expression (A.U.)",
      order.by.mean = FALSE, plot.results = TRUE, cex.labels = 1.5)
  }
dev.off()
}
## quartz_off_screen 
##                 2

Biodomains

We also looked at differential expression based on the biodomains described in PMID: 38650747. The lists are stored at syn25760039. The synIDs for the individual files are listed in Synapse_IDs_biodomain_gene_lists.csv.

There is also a table stored at syn25428992, that annotates the GO terms. Each row in this table is a GO term. The information in each row tells you which biodomain and which subdomain the GO term is included in. This is how Greg makes those boxplots in which each dot is a GO term, and the position is the normalized enrichment score from GSEA. The dots are grouped by domain. I am not going to worry about the subdomains right now, but it will be handy to have the lists of genes in each GO term.

For now, I will make a list of genes in each domain, translate those to mouse orthologs and test for enrichment among differentially expressed genes.

biodomain.dir <- here("Data", "Biodomains")
if(!file.exists(biodomain.dir)){dir.create(biodomain.dir)}

#make a list of the genes in each biodomain
file.dest <- file.path(biodomain.dir, "annotated_biodomains_Oct23.rds")
if(!file.exists(file.dest)){
  synGet(syn25428992, downloadLocation = biodomain.dir)
}

annotated_bd <- readRDS(file.path(biodomain.dir, "annotated_biodomains_Oct23.rds"))
u_bd <- unique(annotated_bd$Biodomain)

bd_genes <- vector(mode = "list", length = length(u_bd))
names(bd_genes) <- u_bd
for(bd in 1:length(u_bd)){
  bd.idx <- which(annotated_bd$Biodomain == u_bd[bd])  
  bd_genes[[bd]] <- Reduce("union", annotated_bd$ensembl_id[bd.idx])
}
saveRDS(bd_genes, file.path(general.processed.data.dir, "Human_Biodomains_for_GSEA.RDS"))


#also gather subdomains in a nested list
sbd_genes <- vector(mode = "list", length = length(u_bd))
names(sbd_genes) <- u_bd
for(bd in 1:length(u_bd)){
  bd.idx <- which(annotated_bd$Biodomain == u_bd[bd])  
  sbd.names <- unique(annotated_bd$Subdomain[bd.idx])
  sbd.names <- sbd.names[which(!is.na(sbd.names))]
  if(length(sbd.names) > 0){
    sbd_list <- vector(mode = "list", length = length(sbd.names))
    names(sbd_list) <- sbd.names
    for(sbd in 1:length(sbd.names)){
      sbd.idx <- which(annotated_bd$Subdomain == sbd.names[sbd])  
      sbd_list[[sbd]] <- Reduce("union", annotated_bd$ensembl_id[sbd.idx])
    }
    sbd_genes[[bd]] <- sbd_list
  }
}
saveRDS(sbd_genes, file.path(general.processed.data.dir, "Human_Subdomains_for_GSEA.RDS"))

The biodomains are in terms of human genes. We need to translate these to mouse orthologs.

mouse.bd.genes <- vector(mode = "list", length = length(bd_genes))
names(mouse.bd.genes) <- names(bd_genes)

for(i in 1:length(bd_genes)){
  bd.with.ortho <- intersect(bd_genes[[i]], orthos[,"Human.Ensembl"])
  bd.idx <- match(bd.with.ortho, orthos[,"Human.Ensembl"])
  mouse.bd.genes[[i]] <- orthos[bd.idx,c("Mouse.Ortholog.Ensembl", "Mouse.Ortholog.Name")]
}

bd.for.gsea <- lapply(mouse.bd.genes, function(x) x[,1])
saveRDS(bd.for.gsea, file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))


#do the same for subdomains
mouse.sbd.genes <- vector(mode = "list", length = length(sbd_genes))
names(mouse.sbd.genes) <- names(sbd_genes)

for(i in 1:length(sbd_genes)){
  if(length(sbd_genes[[i]]) > 0){
    sbd.list <- vector(mode = "list", length = length(sbd_genes[[i]]))
    names(sbd.list) <- names(sbd_genes[[i]])
    for(s in 1:length(sbd.list)){
      sbd.with.ortho <- intersect(sbd_genes[[i]][[s]], orthos[,"Human.Ensembl"])
      sbd.idx <- match(sbd.with.ortho, orthos[,"Human.Ensembl"])
      sbd.list[[s]] <- orthos[sbd.idx,c("Mouse.Ortholog.Ensembl", "Mouse.Ortholog.Name")]
    }
    mouse.sbd.genes[[i]] <- sbd.list
  }
}

sbd.for.gsea <- lapply(mouse.sbd.genes, function(x) lapply(x, function(y) y[,1]))
saveRDS(sbd.for.gsea, file.path(general.processed.data.dir, "Mouse_Subdomains_for_GSEA.RDS"))
#also save gene lists for all GO terms in each biodomain
#This is how Greg Cary makes his enrichment boxplots. He
#gets a normalized enrichment score for each GO term and plots
#all those together for each biodomain
#take any GO term with at least 3 genes
mouse_go_list <- human_go_list <- vector(mode = "list", length = length(bd_genes))
names(mouse_go_list) <- names(human_go_list) <- names(bd_genes)
for(bd in 1:length(bd_genes)){
  bd.idx <- which(annotated_bd$Biodomain == names(bd_genes)[bd])
  bd_go <- annotated_bd$GOterm_Name[bd.idx]
  go_genes <- annotated_bd$ensembl_id[bd.idx]
  names(go_genes) <- bd_go
  num.genes <- sapply(go_genes, length)
  human_go_genes <- go_genes[which(num.genes >= min.term.size)] 
  dups <- which(duplicated(names(human_go_genes)))
  dup.idx <- lapply(dups, function(x) which(names(human_go_genes) == names(human_go_genes)[x]))
  keep.idx <- unique(sapply(dup.idx, function(x) x[1]))
  delete.idx <- setdiff(unique(unlist(dup.idx)), keep.idx)
  no.dups <- setdiff(1:length(human_go_genes), delete.idx)
  human_go_list[[bd]] <- human_go_genes[no.dups]
    
  common_go_genes <- lapply(human_go_genes[no.dups], function(x) intersect(x, orthos[,"Human.Ensembl"]))
  mouse_go_genes <- lapply(common_go_genes, function(x) orthos[match(x, orthos[,"Human.Ensembl"]),"Mouse.Ortholog.Ensembl"])
  num.mouse <- sapply(mouse_go_genes, length)
  mouse_go_list[[bd]] <- mouse_go_genes[which(num.mouse >= min.term.size)]
}
saveRDS(mouse_go_list, file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
saveRDS(human_go_list, file.path(general.processed.data.dir, "Human_Biodomains_sub_GO_for_GSEA.RDS"))

Biodomain Expression

The following heat maps show the mean expression of genes across each biodomain. Each has been clustered by medoid clustering into the best number of clusters (always 2). The two variants seem to have opposing expression relative to the WT. This seems like an artefactual pattern. I’m not sure why it is so consistent.

mean.bd.vals <- vector(mode = "list", length = length(mouse.bd.genes))
names(mean.bd.vals) <- names(mouse.bd.genes)

for(bd in 1:length(mouse.bd.genes)){
  bd.vals <- lapply(1:nrow(mouse.bd.genes[[bd]]), 
    function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
      tx_name = mouse.bd.genes[[bd]][x,1], 
      tx_label = mouse.bd.genes[[bd]][x,2], 
      ylab = "Exression (A.U.)", plot.results = FALSE)[[1]])

  has.vals <- which(sapply(bd.vals, length) > 0)
  bd.mean <- t(sapply(bd.vals[has.vals], function(x) sapply(x[[1]], mean)))
  rownames(bd.mean) <- sapply(bd.vals[has.vals], names)
  mean.bd.vals[[bd]] <- bd.mean
}
#In previous runs I tried 2 to 10 clusters. All ended up with 2,
#so I am just setting k = 2 here to save time, but leaving the 
#code for testing for future work. 

clustered.gene.file <- file.path(results.dir, "Clustered_Coef.RDS")
clustered.enrich.file <- file.path(results.dir, "Clustered_Enrich.RDS")

if(!file.exists(clustered.gene.file)){
  #k.seq = 2:4
  clustered.genes <- gene.enrich <- vector(mode = "list", length = length(mean.bd.vals))
  names(clustered.genes) <- names(gene.enrich) <- names(mean.bd.vals)

  for(i in 1:length(mean.bd.vals)){
    #test.k <- test.pam.k(mean.bd.vals[[i]], plot.results = FALSE, kseq = k.seq)
    #mean.cl.width <- sapply(test.k$cl.width, mean)
    #barplot(mean.cl.width)
    #select the number of clusters with the best separation among clusters.
    #k = as.numeric(names(mean.cl.width)[which.max(mean.cl.width)])
    k = 2
    final.clust <- pam(mean.bd.vals[[i]], k = k)
    clustered.genes[[i]] <- final.clust$clustering
    cluster.enrich <- lapply(1:k, function(x) gost(names(clustered.genes[[i]])[which(clustered.genes[[i]] == x)], 
      organism = "mmusculus", source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
    names(cluster.enrich) <- paste0("cluster", 1:k)
    gene.enrich[[i]] <- cluster.enrich
      #plot.enrichment.group(cluster.enrich, max.term.size = 3000, n.terms = 30, cluster_cols = FALSE, plot.label = names(mean.bd.vals)[i])
  }
  saveRDS(clustered.genes, clustered.gene.file)
  saveRDS(gene.enrich, clustered.enrich.file)
}else{
  clustered.genes <- readRDS(clustered.gene.file)
  gene.enrich <- readRDS(clustered.enrich.file)
}

bd.means <- matrix(nrow = length(mean.bd.vals), ncol = 5)
rownames(bd.means) <- names(mean.bd.vals)
colnames(bd.means) <- genotypes[1:5]

#pdf("~/Desktop/test.pdf", width = 8, height = 4)
for(i in 1:length(mean.bd.vals)){
  cat("###", names(mean.bd.vals)[i], "\n")
  #pheatmap(t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],]), 
  #  cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE, main = names(mean.bd.vals)[i])
  par(mfrow = c(1,2))
  par(mar = c(4,4,4,0))
  clustered.mat <- t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],])
  cluster.means <- rowMeans(clustered.mat)

  mean.order <- order(cluster.means)
  bd.means[i,] <- cluster.means
  imageWithText(clustered.mat[mean.order,], 
    show.text = FALSE, use.pheatmap.colors = TRUE, col.names = NULL)
  par(mar = c(4,4,4,2))
  #p values seem to depend only on how many genes are in the group
  diff.test <- aov.by.matrix(t(clustered.mat))
  diff.p <- summary(diff.test)[[1]]$"Pr(>F)"[1]
  vioplot(t(clustered.mat[rev(mean.order),]),
    main = "", horizontal = TRUE, 
    las = 2, col = geno.cols[rownames(clustered.mat)[rev(mean.order)]])
  segments(x0 = 0, y0 = 0, y1 = 5.7)
  mtext(names(mean.bd.vals)[i], side = 3, outer = TRUE, font = 2, line = -2.5)
  cat("\n\n")  
}

Apoptosis

APP Metabolism

Autophagy

Cell Cycle

DNA Repair

Endolysosome

Epigenetic

Immune Response

Lipid Metabolism

Metal Binding and Homeostasis

Mitochondrial Metabolism

Myelination

Oxidative Stress

Proteostasis

RNA Spliceosome

Structural Stabilization

Synapse

Tau Homeostasis

Vasculature

#dev.off()

Biodomain Expression Summary

The following plot shows a summary of the mean expression of the biodomains in each genotype.

bd.mean.decomp <- plot.decomp(t(bd.means), plot.results = FALSE)
row.order <- order(bd.mean.decomp$v[,1])

layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.5))
par(mar = c(4,18, 2,1))
imageWithText(signif(bd.means[row.order,], 2), split.at.vals = TRUE, 
  col.scale = c("blue", "brown"), grad.dir = "ends", row.text.shift = 0.15,
  col.text.shift = 0.03)
par(mar = c(4,1,2,2))
barplot(bd.mean.decomp$v[row.order,1], horiz = TRUE, xlab = "PC1")

Subdomains

We did the same analysis above for the subdomains.

The following heat maps show the mean expression of genes across each subdomain. Each has been clustered by medoid clustering.

sbd.mean.file <- file.path(results.dir, "Subdomain_Mean_Expr.RDS")
if(!file.exists(sbd.mean.file)){
  mean.sbd.vals <- vector(mode = "list", length = length(mouse.sbd.genes))
  names(mean.sbd.vals) <- names(mouse.sbd.genes)

  for(bd in 1:length(mouse.sbd.genes)){
    if(length(mouse.sbd.genes[[bd]]) > 0){
      sub.means <- vector(mode = "list", length = length(mouse.sbd.genes[[bd]]))
      names(sub.means) <- names(mouse.sbd.genes[[bd]])
      for(s in 1:length(mouse.sbd.genes[[bd]])){
        sbd <- mouse.sbd.genes[[bd]][[s]]
        sbd.vals <- lapply(1:nrow(sbd), 
          function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
            tx_name = sbd[x,1], tx_label = sbd[x,2], 
            ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])

        has.vals <- which(sapply(sbd.vals, length) > 0)
        sbd.mean <- t(sapply(sbd.vals[has.vals], function(x) sapply(x[[1]], mean)))
        rownames(sbd.mean) <- sapply(sbd.vals[has.vals], names)
        sub.means[[s]] <- sbd.mean
      }
    }
    mean.sbd.vals[[bd]] <- sub.means
  }
  saveRDS(mean.sbd.vals, sbd.mean.file)
}else{
  mean.sbd.vals <- readRDS(sbd.mean.file)
}
clustered.sd.file <- file.path(results.dir, "Clustered_Coef_SBD.RDS")
#clustered.sd.enrich.file <- file.path(results.dir, "Clustered_Enrich_SBD.RDS")

if(!file.exists(clustered.sd.file)){
  clustered.sd.genes <- vector(mode = "list", length = length(mean.sbd.vals))
  names(clustered.sd.genes) <- names(mean.sbd.vals)

  for(bd in 1:length(mean.sbd.vals)){

      if(length(mean.sbd.vals[[bd]]) > 0){
          clustered.sd <- vector(mode = "list", length = length(mean.sbd.vals[[bd]]))
          names(clustered.sd) <- names(mean.sbd.vals[[bd]])
          for(s in 1:length(mean.sbd.vals[[bd]])){
            k = 2
            if(length(mean.sbd.vals[[bd]][[s]]) > 0){
              final.clust <- pam(mean.sbd.vals[[bd]][[s]], k = k)
              clustered.sd[[s]] <- final.clust$clustering
              #cluster.enrich <- lapply(1:k, function(x) gost(names(clustered.genes[[i]])[which(clustered.genes[[i]] == x)], 
              #  organism = "mmusculus", source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
              #names(cluster.enrich) <- paste0("cluster", 1:k)
              #gene.enrich[[i]] <- cluster.enrich
              #plot.enrichment.group(cluster.enrich, max.term.size = 3000, n.terms = 30, cluster_cols = FALSE, plot.label = names(mean.bd.vals)[i])
            }
          }
      clustered.sd.genes[[bd]] <- clustered.sd
      }
    }
    saveRDS(clustered.sd.genes, clustered.sd.file)
    #saveRDS(gene.enrich, clustered.enrich.file)
}else{
  clustered.sd.genes <- readRDS(clustered.sd.file)
  #gene.enrich <- readRDS(clustered.enrich.file)
}
num.sbd <- sum(sapply(mean.sbd.vals, length))
sbd.means <- matrix(nrow = num.sbd, ncol = 5)
sbd.names <- vector(mode = "list", length = length(mean.sbd.vals))
for(i in 1:length(mean.sbd.vals)){
  sbd.names[[i]] <- paste(names(mean.sbd.vals)[i], names(mean.sbd.vals[[i]]), sep = " : ")
}
rownames(sbd.means) <- unlist(sbd.names)
colnames(sbd.means) <- genotypes[1:5]

#pdf("~/Desktop/test_sd.pdf", width = 8, height = 4)
idx <- 1
for(bd in 1:length(mean.sbd.vals)){
  cat("###", names(mean.sbd.vals[bd]), "{.tabset .tabset-fade .tabset-pills}\n")
  if(length(mean.sbd.vals[[bd]]) > 0){
    for(s in 1:length(mean.sbd.vals[[bd]])){
      cat("####", names(mean.sbd.vals[[bd]][s]), "\n")
      #pheatmap(t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],]), 
      #  cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE, main = names(mean.bd.vals)[i])
      par(mfrow = c(1,2))
      par(mar = c(4,4,4,0))
      clustered.mat <- t(mean.sbd.vals[[bd]][[s]][names(clustered.sd.genes[[bd]][[s]])[order(clustered.sd.genes[[bd]][[s]])],])
      cluster.means <- rowMeans(clustered.mat)

      mean.order <- order(cluster.means)
      sbd.means[idx,] <- cluster.means
      imageWithText(clustered.mat[mean.order,], 
        show.text = FALSE, use.pheatmap.colors = TRUE, col.names = NULL)
      par(mar = c(4,4,4,2))
      #p values seem to depend only on how many genes are in the group
      diff.test <- aov.by.matrix(t(clustered.mat))
      diff.p <- summary(diff.test)[[1]]$"Pr(>F)"[1]
      vioplot(t(clustered.mat[rev(mean.order),]),
        main = "", horizontal = TRUE, 
        las = 2, col = geno.cols[rownames(clustered.mat)[rev(mean.order)]])
      segments(x0 = 0, y0 = 0, y1 = 5.7)
      mtext(rownames(sbd.means)[idx], side = 3, outer = TRUE, font = 2, line = -2.5)
      cat("\n\n")  
      idx <- idx + 1
      cat("\n\n")
    }
  }
  cat("\n\n")
}

Apoptosis

intrinsic apoptotic signaling pathway

extrinsic apoptotic signaling pathway

regulation of oxidative stress-induced intrinsic apoptotic signaling pathway

neuron apoptotic process

apoptotic mitochondrial changes

canonical NF-kappaB signal transduction

inflammatory cell apoptotic process

regulation of cysteine-type endopeptidase activity involved in apoptotic process

programmed necrotic cell death

apoptotic process involved in development

glial cell apoptotic process

APP Metabolism

amyloid-beta clearance

amyloid precursor protein metabolic process

amyloid precursor protein biosynthetic process

amyloid-beta formation

Autophagy

macroautophagy

phagocytosis

mitophagy

Cell Cycle

cell cycle phase transition

DNA replication

microtubule cytoskeleton organization involved in mitosis

DNA Repair

global genome nucleotide-excision repair

DNA damage response

DNA repair complex

Endolysosome

clathrin-coated vesicle

receptor-mediated endocytosis

endocytic vesicle

early endosome

late endosome

lysosome

recycling endosome

endosomal transport

receptor internalization

exocytic vesicle

Epigenetic

histone modification

miRNA-mediated post-transcriptional gene silencing

DNA-binding transcription factor binding

Immune Response

activation of innate immune response

adaptive immune response

cytokine production

phagocytosis

neuroinflammatory response

leukocyte chemotaxis

behavioral defense response

Lipid Metabolism

lipid transport

lipid storage

fatty acid biosynthetic process

fatty acid metabolic process

lipid catabolic process

phosphatidylinositol metabolic process

phospholipid binding

glycerolipid metabolic process

cholesterol metabolic process

triglyceride metabolic process

glycolipid metabolic process

sphingolipid metabolic process

phosphatidylcholine metabolic process

phosphatidylserine metabolic process

Metal Binding and Homeostasis

cellular response to metal ion

regulation of metal ion transport

metal ion binding

oxidoreductase activity, acting on metal ions

metallopeptidase activity

Mitochondrial Metabolism

proton motive force-driven mitochondrial ATP synthesis

mitochondrial transport

mitochondrial membrane organization

apoptotic mitochondrial changes

electron transport chain

tricarboxylic acid cycle

mitochondrial translation

mitochondrion localization

mitochondrial calcium ion homeostasis

autophagy of mitochondrion

energy derivation by oxidation of organic compounds

Myelination

oligodendrocyte development

Schwann cell development

axon ensheathment

axon regeneration

Oxidative Stress

reactive oxygen species metabolic process

response to oxidative stress

intrinsic apoptotic signaling pathway in response to oxidative stress

Proteostasis

translation

proteasomal protein catabolic process

Golgi organization

endoplasmic reticulum

response to endoplasmic reticulum stress

vesicle-mediated transport to the plasma membrane

RNA Spliceosome

translation

proteasomal protein catabolic process

Golgi organization

endoplasmic reticulum

response to endoplasmic reticulum stress

vesicle-mediated transport to the plasma membrane

Structural Stabilization

actin filament organization

cell-substrate adhesion

microtubule polymerization or depolymerization

cell-cell junction organization

extracellular matrix organization

Synapse

trans-synaptic signaling

axon regeneration

axonal transport

dendritic transport

synaptic vesicle cycle

postsynapse organization

presynapse organization

learning or memory

Tau Homeostasis

tau-protein kinase activity

neurofibrillary tangle assembly

Vasculature

cardiac conduction

angiogenesis

endothelial cell migration

vasodilation

vascular associated smooth muscle cell development

maintenance of blood-brain barrier

vascular endothelial growth factor receptor signaling pathway

vascular associated smooth muscle cell apoptotic process

#dev.off()

Subdomain Expression Summary

The following plot shows a summary of the mean expression of the subdomains in each genotype. Because there are so many subdomains, we subset to only those with a |PC1| value greater than 0.5.

sbd.decomp <- plot.decomp(sbd.means, plot.results = FALSE)
big.idx <- which(abs(sbd.decomp$u[,1]) > 0.1)
row.order <- order(sbd.decomp$u[big.idx,1])

#png("~/Desktop/sbd_overview.png", width = 15, height = 8, units = "in", res = 300)
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.5))
par(mar = c(4,35, 2,1))
imageWithText(signif(sbd.means[big.idx[row.order],], 2), split.at.vals = TRUE, 
  col.scale = c("blue", "brown"), grad.dir = "ends", row.text.shift = 0.15,
  col.text.shift = 0.05)
par(mar = c(4,1,2,2))
barplot(sbd.decomp$u[big.idx[row.order],1], horiz = TRUE, xlab = "PC1")
plot.dim  <- par("usr")
segments(x0 = c(-0.1, 0, 0.1), y0 = 0, y1 = plot.dim[4], lty = c(2,1,2))

#dev.off()

KEGG

KEGG pathways are AD-agnostic and are organized into different functional groups that could be informative. We also looked at gene expression based on KEGG pathways.

mouse.kegg.file <- file.path(general.data.dir, "KEGG.Mouse.RDS")
if(!file.exists(mouse.kegg.file)){
    all.mouse.kegg <- download_KEGG("mmu", "KEGG", "kegg")
    saveRDS(all.mouse.kegg, mouse.kegg.file)
}else{
    all.mouse.kegg <- readRDS(mouse.kegg.file)
}

human.kegg.file <- file.path(general.data.dir, "KEGG.Human.RDS")
if(!file.exists(human.kegg.file)){
    all.human.kegg <- download_KEGG("hsa", "KEGG", "kegg")
    saveRDS(all.human.kegg, human.kegg.file)
}else{
    all.human.kegg <- readRDS(human.kegg.file)
}


#convert mouse entrez IDs to ensembl IDs
u_path <- gsub(" - Mus musculus (house mouse)", "", all.mouse.kegg[[2]][,"to"], fixed = TRUE)

path.id <- all.mouse.kegg[[2]][,1]
path.idx <- lapply(path.id, function(x) which(all.mouse.kegg[[1]][,1] == unlist(x)[1]))
path.gene.id <- lapply(path.idx, function(x) all.mouse.kegg[[1]][x,2])
names(path.gene.id) <- u_path

mouse.path.gene.ensembl <- lapply(path.gene.id, 
    function(x) mouse.gene.table[match(x, mouse.gene.table[,"entrezgene_id"]), "ensembl_gene_id"])

saveRDS(mouse.path.gene.ensembl, file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))


#convert human entrez IDs to ensembl IDs
u_path <- all.human.kegg[[2]][,"to"]
path.id <- all.human.kegg[[2]][,1]
path.idx <- lapply(path.id, function(x) which(all.human.kegg[[1]][,1] == unlist(x)[1]))
path.gene.id <- lapply(path.idx, function(x) all.human.kegg[[1]][x,2])
names(path.gene.id) <- u_path

#use the ortholog table to convert. We only care about genes
#that have mouse orthologs anyway (I think).
hum.path.gene.ensembl <- lapply(path.gene.id, 
    function(x) orthos[match(x, orthos[,"Human.Entrez"]), "Human.Ensembl"])

saveRDS(hum.path.gene.ensembl, file.path(general.processed.data.dir, "Human_KEGG_for_GSEA.RDS"))

For each mouse KEGG pathway, is there differential expression across genotypes? The results are printed to pdfs in the results folder. There are too many KEGG pathways to display here.

mean.kegg.vals <- vector(mode = "list", length = length(mouse.path.gene.ensembl))
names(mean.kegg.vals) <- names(mouse.path.gene.ensembl)

for(k in 1:length(mouse.path.gene.ensembl)){
  k.vals <- lapply(1:length(mouse.path.gene.ensembl[[k]]), 
    function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
      tx_name = mouse.path.gene.ensembl[[k]][x], 
      tx_label = NA, ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])

  has.vals <- which(sapply(k.vals, length) > 0)
  k.mean <- t(sapply(k.vals[has.vals], function(x) sapply(x[[1]], mean)))
  #pheatmap(k.mean)
  mean.kegg.vals[[k]] <- k.mean
}

test.diff <- lapply(mean.kegg.vals, aov.by.matrix)
test.p <- sapply(test.diff, function(x) x$"Pr(>F)"[1])
#qqunif.plot(test.p)

mean.mean <- t(sapply(mean.kegg.vals, colMeans))

#pdf(file.path(results.dir, "kegg.pdf"), width = 7, height = 42)
#pheatmap(mean.mean, cluster_cols = FALSE)
#dev.off()

term.name = "Virion - Rotavirus"
term.name = "Ribosome"
term.name = "Biotin metabolism"
term.idx <- which(names(mean.kegg.vals) == term.name)
#boxplot(mean.kegg.vals[[term.idx]], main = term.name, ylab = "Scaled Expression")
#pheatmap(mean.kegg.vals[[term.idx]], main = term.name, show_rownames = FALSE, cluster_cols = FALSE)
#mean.kegg.vals[[term.idx]]

Intersections

Matt said he read a paper that suggested that a useful unit of inquiry was the intersections between GO terms and KEGG terms. These intersections carry both functional information and pathway information.

Here we take any intersection with at least 3 genes

intersect_bd_kegg <- function(bd.list, kegg.list){

  intersect.list <- vector(mode = "list", length = length(bd.list))
  names(intersect.list) <- names(bd.list)

  for(bd in 1:length(bd.list)){
    k.intersections <- vector(mode = "list", length = length(kegg.list))
    names(k.intersections) <- names(kegg.list)  
    
    for(k in 1:length(kegg.list)){
      k.intersections[[k]] <- intersect(bd.list[[bd]], kegg.list[[k]])
    }

    has.genes <- which(sapply(k.intersections, length) >= min.term.size)
    intersect.list[[bd]] <- k.intersections[has.genes]
  }
  return(intersect.list)
}

#save grouped form, so we can do the boxplot thingy that Greg Cary does.
mouse.bd.k.intersections <- intersect_bd_kegg(bd.list = bd.for.gsea, kegg.list = mouse.path.gene.ensembl)
saveRDS(mouse.bd.k.intersections, file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))

human.bd.k.intersections <- intersect_bd_kegg(bd_genes, hum.path.gene.ensembl)
saveRDS(human.bd.k.intersections, file.path(general.processed.data.dir, "Human_KEGG_Intersections_for_GSEA.RDS"))
mean.bd.k.file <- file.path(results.dir, "KEGG_BD_mean_expression.RDS")

if(!file.exists(mean.bd.k.file)){
  mean.bd.k.vals <- vector(mode = "list", length = length(mouse.path.gene.ensembl))
  names(mean.bd.k.vals) <- names(mouse.path.gene.ensembl)

  #for each kegg pathway
  for(k in 1:length(mouse.path.gene.ensembl)){
    kegg.path <- names(mouse.path.gene.ensembl)[k]
    has.intersection <- which(sapply(lapply(mouse.bd.k.intersections, function(x) which(names(x) == kegg.path)), length) > 0)
    if(length(has.intersection) == 0){next()}

    intersect.mean.vals <- vector(mode = "list", length = length(has.intersection))
    names(intersect.mean.vals) <- names(has.intersection)

    all.val.mat <- matrix(NA, nrow = length(mouse.bd.genes), ncol = 5)
    rownames(all.val.mat) <- names(mouse.bd.genes)
    colnames(all.val.mat) <- genotypes[1:5]

    #pull out all gene values for intersections with biodomains
    for(bd in 1:length(has.intersection)){
      kegg.idx <- sapply(mouse.bd.k.intersections[has.intersection], function(x) which(names(x) == kegg.path))
      
      intersect.ids <- mouse.bd.k.intersections[[has.intersection[bd]]][[kegg.idx[bd]]]
      bd.k.vals <- lapply(intersect.ids, 
        function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat, 
        tx_name = x, tx_label = NA, ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])

      has.vals <- which(sapply(bd.k.vals, length) > 0)
      k.mean <- t(sapply(bd.k.vals[has.vals], function(x) sapply(x[[1]], mean)))
      #pheatmap(k.mean)
      rownames(k.mean) <- intersect.ids[has.vals]
      intersect.mean.vals[[bd]] <- k.mean
    }
    has.vals <- which(sapply(intersect.mean.vals, length) > 0)
    mean.mat <- t(sapply(intersect.mean.vals[has.vals], colMeans))
    all.val.mat[rownames(mean.mat), colnames(mean.mat)] <- mean.mat[rownames(mean.mat), colnames(mean.mat)]
    mean.bd.k.vals[[k]] <- all.val.mat
    #pheatmap(mean.bd.k.vals[[k]], main = names(path.gene.ensembl)[k], cluster_cols = FALSE, cluster_rows = FALSE)
  }
  saveRDS(mean.bd.k.vals, mean.bd.k.file)
}else{
  mean.bd.k.vals <- readRDS(mean.bd.k.file)
}

has.vals <- which(sapply(mean.bd.k.vals, length) > 0)
min.val <- min(unlist(mean.bd.k.vals[has.vals]), na.rm = TRUE)
max.val <- max(unlist(mean.bd.k.vals[has.vals]), na.rm = TRUE)
#custom.dist <- unlist(mean.bd.k.vals[has.vals])
custom.dist <- rnorm(1000, 0, 0.2)
#hist(custom.dist)

kegg.plot.dir <- file.path(results.dir, "KEGG_plots")
if(!file.exists(kegg.plot.dir)){dir.create(kegg.plot.dir)}


pdf(file.path(kegg.plot.dir, "kegg_bd_intersections.pdf"), width = 7, height = 7)
hist(unlist(mean.bd.k.vals), xlab = "Mean Value", main = "Mean Biodomain-Kegg Intersection Expression")

for(i in 1:length(mean.bd.k.vals)){
  if(length(mean.bd.k.vals[[i]]) > 0){
    par(mar = c(4,12,2,2))
    neg.vals <- length(which(mean.bd.k.vals[[i]] < 0))
    pos.vals <- length(which(mean.bd.k.vals[[i]] > 0))
    if(neg.vals > 0 && pos.vals > 0){
      col.scale = c("blue", "brown")
      grad.dir = "ends"
      split.at.vals = TRUE
    }
    if(neg.vals > 0 && pos.vals == 0){
      col.scale = "blue"
      grad.dir = "low"
      split.at.vals = TRUE
    }
    if(neg.vals == 0 && pos.vals > 0){
      col.scale = "brown"
      grad.dir = "high"
      split.at.vals = TRUE
    }

  imageWithText(mat = mean.bd.k.vals[[i]], main = names(mean.bd.k.vals)[i], 
      col.scale = col.scale, grad.dir = grad.dir, col.text.rotation = 0, 
      col.text.adj = 0.5, col.text.shift = 0.06, row.text.shift = 0.15, 
      main.shift = 0.05, color.fun = "linear", split.at.vals = split.at.vals,
      light.dark = "f", global.color.scale = TRUE, global.min = min.val, 
      global.max = max.val)

  }
}
dev.off()
## quartz_off_screen 
##                 2
global.min <- min(unlist(mean.bd.k.vals), na.rm = TRUE)
global.max <- max(unlist(mean.bd.k.vals), na.rm = TRUE)
#plot all results for each biodomain on one page
for(bd in 1:length(bd_genes)){
  bd.mat <- t(sapply(mean.bd.k.vals, function(x) if(length(x) > 0){x[bd,]}else{rep(NA, 5)}))
  has.vals <- which(!is.na(rowMeans(bd.mat)))

  #plot.decomp(t(bd.mat[has.vals,]), label.points = TRUE, main = names(bd_genes)[bd])
  #bd.decomp <- plot.decomp(bd.mat[has.vals,], main = names(bd_genes)[bd])
  #plot.decomp(bd.mat[has.vals,], main = names(bd_genes)[bd], label.points = TRUE)

  if(length(has.vals) > 0){
    png(file.path(kegg.plot.dir, paste0("bd_kegg_intersections_", names(bd_genes)[bd], ".png")), 
      width = 7, height = max(c(length(has.vals)/7, 7)), units = "in", res = 200)
    row.order <- hclust(dist(bd.mat[has.vals,]))$order
    par(mar = c(4,16, 2, 2))
    test <- imageWithText(mat = bd.mat[has.vals[row.order],], main = names(bd_genes)[bd], 
        col.scale = c("blue", "brown"), grad.dir = "ends", col.text.rotation = 0, 
        col.text.adj = 0.5, col.text.shift = 0.01, row.text.shift = 0.15, 
        main.shift = 0.01, color.fun = "linear", split.at.vals = TRUE,
        color.dist = NULL, light.dark = "f", global.color.scale = TRUE,
        global.min = global.min, global.max = global.max)
    dev.off()
  }
}

We put all the results together to see if there were any combinations that stood out overall. The plot below shows that the expression across all KEGG pathway and biodomain intersections separates the genotypes nicely. Along the first principal component, the VS genotypes are negative, the WT mice are at 0, and the FC genotypes are positive.

big.mat <- Reduce("rbind", mean.bd.k.vals)
group.names <- unlist(lapply(1:length(mean.bd.k.vals), function(x) if(length(mean.bd.k.vals[[x]] > 0)){paste(names(mean.bd.k.vals)[x], rownames(mean.bd.k.vals[[x]]), sep = "--")}))
rownames(big.mat) <- group.names
#pdf("~/Desktop/big.mat.pdf", height = 20, width = 10)
#pheatmap(big.mat, cluster_rows = FALSE, cluster_cols = FALSE)
#dev.off()
not.na <- which(!is.na(rowSums(big.mat)))

sub.mat <- big.mat[not.na,]
process.decomp <- plot.decomp(t(sub.mat), label.points = TRUE, xlim = c(-1, 1), 
  cols = geno.cols[match(names(geno.cols), colnames(sub.mat))], cex = 1.5)

process.vals <- process.decomp$v

pc.order <- order(process.decomp$u[,1])
barplot(process.decomp$u[pc.order,1], 
  col = geno.cols[match(names(geno.cols), colnames(sub.mat))][pc.order], 
  names = colnames(sub.mat)[pc.order], ylab = "PC1")
abline(h = 0)

#process.decomp <- plot.decomp(big.mat[has.vals,], label.points = TRUE)
#plot(process.decomp$v, type = "n")
#text(process.decomp$v[,1], process.decomp$v[,2], labels = colnames(big.mat))
#process.vals <- process.decomp$u

The processes don’t have any particular clustering as shown below. But there are some pathways that have large negative and positive values in the first PC.

plot(process.decomp$v, xlab = "PC1", ylab = "PC2", pch = 16)
large.neg <- which(process.decomp$v[,1] < -0.04)
large.pos <- which(process.decomp$v[,1] > 0.05)
text(process.decomp$v[large.pos,1], process.decomp$v[large.pos,2], labels = rownames(sub.mat)[large.pos], cex = 0.5)
text(process.decomp$v[large.neg,1], process.decomp$v[large.neg,2], labels = rownames(sub.mat)[large.neg], cex = 0.5)

The plots below show the mean expression for each of these processes with large PC values.

pc.order <- order(process.vals[,1], decreasing = TRUE)
most.neg <- tail(pc.order, 20)
most.pos <- head(pc.order, 20)
#most.pos <- pc.order[1620:1630] #check middle

#test.mat <- process.decomp$v[most.neg,1,drop=FALSE]
#rownames(test.mat) <-  rownames(sub.mat)[most.neg]
#test.mat[order(test.mat[,1]),]

par(mar = c(4, 24, 2, 2))
layout(matrix(c(1,2), ncol = 2), widths = c(1, 0.5))
imageWithText(sub.mat[rev(most.neg),], col.scale = c("blue", "brown"), split.at.vals = TRUE,
  grad.dir = "ends", col.text.rotation = 0, col.text.adj = 0.5, col.text.shift = 0.05, row.text.cex = 0.7,
  row.text.shift = 0.15, main = "Large Negative PC pathways")
par(mar = c(3.8, 0, 1.8, 2))
barplot(process.vals[most.neg,1], horiz = TRUE)
mtext("PC1", side = 1, line = 2.2)

#test.mat <- process.decomp$v[most.pos,1,drop=FALSE]
#rownames(test.mat) <-  rownames(sub.mat)[most.pos]
#test.mat[order(test.mat[,1]),]

par(mar = c(4, 18, 2, 0))
#row.order <- hclust(dist(sub.mat[most.pos,]))$order
layout(matrix(c(1,2), ncol = 2), widths = c(1, 0.5))
imageWithText(sub.mat[most.pos,], col.scale = c("blue", "brown"), split.at.vals = TRUE,
  grad.dir = "ends", col.text.rotation = 0, col.text.adj = 0.5, col.text.shift = 0.05, row.text.cex = 0.7,
  row.text.shift = 0.15, main = "Large Positive PC pathways")
par(mar = c(3.8, 0, 1.8, 2))
barplot(process.vals[rev(most.pos),1], horiz = TRUE)
mtext("PC1", side = 1, line = 2.2)

Can we build a biodomain network that shows us where the biggest differences are? The biodomains were designed such

#This is quite derived, but let's see what happens when we collect
#the loadings on PC1 for each kegg-biodomain pair.

umat <- matrix(0, nrow = length(mouse.path.gene.ensembl), ncol = length(bd_genes))
rownames(umat) <- names(mouse.path.gene.ensembl)
colnames(umat) <- names(bd_genes)
split.names <- strsplit(rownames(sub.mat), "--")
split.kegg <- sapply(split.names, function(x) x[1])
split.bd <- sapply(split.names, function(x) x[2])
for(i in 1:nrow(process.vals)){
  umat[split.kegg[i], split.bd[i]] <- process.vals[i,1]
}

#pheatmap(umat)
no.vals <- which(rowSums(umat) == 0)
has.vals <- which(rowSums(umat) != 0)
row.order <- order(rowSums(umat[has.vals,]))
col.order <- order(colSums(umat[has.vals,]))

pdf(file.path(kegg.plot.dir, "KEGG_Umat.pdf"), width = 9, height = 40)
pheatmap(umat[has.vals[row.order],col.order], cluster_cols = TRUE, cluster_rows = TRUE)

dev.off()
## pdf 
##   3

The following image shows just the rows with strong loadings for a given biodomain-KEGG intersection.

#filter to rows with the largest values
#hist(umat)
lower.thresh <- -0.04
upper.thresh <- 0.04
high.val <- which(apply(umat, 1, function(x) any(x > upper.thresh)))
low.val <- which(apply(umat, 1, function(x) any(x < lower.thresh)))
strong.vals <- union(high.val, low.val)

pheatmap(umat[strong.vals,])

#pdf("~/Desktop/sub_mat.pdf", width = 8, height = 9)
#pheatmap(umat[strong.vals,])
#dev.off()

#net <- graph_from_incidence_matrix(abs(umat[strong.vals,]))
#plot(net)

#pdf("~/Desktop/bip_plot.pdf", width = 10, height = 10)
#plot_bipartite(abs(umat[strong.vals,]))
#dev.off()
if(need.to.download){
  synLogout()
}